-
Notifications
You must be signed in to change notification settings - Fork 11.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenMP] Associate the KernelEnvironment with the GenericKernelTy #70383
[OpenMP] Associate the KernelEnvironment with the GenericKernelTy #70383
Conversation
2f0a9bc
to
f6d6661
Compare
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen Author: Johannes Doerfert (jdoerfert) Changes: By associating the kernel environment with the generic kernel we can access middle-end information easily, including the launch bounds ranges that are acceptable. Patch is 1.54 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70383.diff 25 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 6262b3416a1730a..c1be7c2d0321589 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6002,6 +6002,42 @@ void CGOpenMPRuntime::emitUsesAllocatorsFini(CodeGenFunction &CGF,
{ThreadId, AllocatorVal});
}
+void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams(
+ const OMPExecutableDirective &D, CodeGenFunction &CGF,
+ int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal,
+ int32_t &MaxTeamsVal) {
+
+ getNumTeamsExprForTargetDirective(CGF, D, MinTeamsVal, MaxTeamsVal);
+ getNumThreadsExprForTargetDirective(CGF, D, MaxThreadsVal,
+ /*UpperBoundOnly=*/true);
+
+ for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
+ for (auto *A : C->getAttrs()) {
+ int32_t AttrMinThreadsVal = 1, AttrMaxThreadsVal = -1;
+ int32_t AttrMinBlocksVal = 1, AttrMaxBlocksVal = -1;
+ if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
+ CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &AttrMaxThreadsVal,
+ &AttrMinBlocksVal, &AttrMaxBlocksVal);
+ else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
+ CGM.handleAMDGPUFlatWorkGroupSizeAttr(
+ nullptr, Attr, /*ReqdWGS=*/nullptr, &AttrMinThreadsVal,
+ &AttrMaxThreadsVal);
+ else
+ continue;
+
+ MinThreadsVal = std::max(MinThreadsVal, AttrMinThreadsVal);
+ if (AttrMaxThreadsVal > 0)
+ MaxThreadsVal = MaxThreadsVal > 0
+ ? std::min(MaxThreadsVal, AttrMaxThreadsVal)
+ : AttrMaxThreadsVal;
+ MinTeamsVal = std::max(MinTeamsVal, AttrMinBlocksVal);
+ if (AttrMaxBlocksVal > 0)
+ MaxTeamsVal = MaxTeamsVal > 0 ? std::min(MaxTeamsVal, AttrMaxBlocksVal)
+ : AttrMaxBlocksVal;
+ }
+ }
+}
+
void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
@@ -6020,47 +6056,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
return CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc());
};
- // Get NumTeams and ThreadLimit attributes
- int32_t DefaultValMinTeams = 1;
- int32_t DefaultValMaxTeams = -1;
- uint32_t DefaultValMinThreads = 1;
- uint32_t DefaultValMaxThreads = UINT32_MAX;
-
- getNumTeamsExprForTargetDirective(CGF, D, DefaultValMinTeams,
- DefaultValMaxTeams);
- getNumThreadsExprForTargetDirective(CGF, D, DefaultValMaxThreads,
- /*UpperBoundOnly=*/true);
-
- for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
- for (auto *A : C->getAttrs()) {
- int32_t MinThreadsVal = 1, MaxThreadsVal = 0;
- int32_t MinBlocksVal = 1, MaxBlocksVal = -1;
- if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
- CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &MaxThreadsVal,
- &MinBlocksVal, &MaxBlocksVal);
- else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
- CGM.handleAMDGPUFlatWorkGroupSizeAttr(
- nullptr, Attr, /*ReqdWGS=*/nullptr, &MinThreadsVal, &MaxThreadsVal);
- else
- continue;
-
- DefaultValMinThreads =
- std::max(DefaultValMinThreads, uint32_t(MinThreadsVal));
- DefaultValMaxThreads =
- DefaultValMaxThreads
- ? std::min(DefaultValMaxThreads, uint32_t(MaxThreadsVal))
- : MaxThreadsVal;
- DefaultValMinTeams = DefaultValMinTeams
- ? std::max(DefaultValMinTeams, MinBlocksVal)
- : MinBlocksVal;
- DefaultValMaxTeams = std::min(DefaultValMaxTeams, MaxBlocksVal);
- }
- }
-
- OMPBuilder.emitTargetRegionFunction(
- EntryInfo, GenerateOutlinedFunction, DefaultValMinTeams,
- DefaultValMaxTeams, DefaultValMinThreads, DefaultValMaxThreads,
- IsOffloadEntry, OutlinedFn, OutlinedFnID);
+ OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
+ IsOffloadEntry, OutlinedFn, OutlinedFnID);
if (!OutlinedFn)
return;
@@ -6306,7 +6303,7 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
/// store the condition in \p CondVal. If \p E, and \p CondVal respectively, are
/// nullptr, no expression evaluation is perfomed.
static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
- const Expr **E, uint32_t &UpperBound,
+ const Expr **E, int32_t &UpperBound,
bool UpperBoundOnly, llvm::Value **CondVal) {
const Stmt *Child = CGOpenMPRuntime::getSingleCompoundChild(
CGF.getContext(), CS->getCapturedStmt());
@@ -6368,10 +6365,10 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
UpperBound
? Constant->getZExtValue()
: std::min(UpperBound,
- static_cast<uint32_t>(Constant->getZExtValue()));
+ static_cast<int32_t>(Constant->getZExtValue()));
// If we haven't found a upper bound, remember we saw a thread limiting
// clause.
- if (UpperBound == UINT32_MAX)
+ if (UpperBound == -1)
UpperBound = 0;
if (!E)
return;
@@ -6397,7 +6394,7 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
}
const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective(
- CodeGenFunction &CGF, const OMPExecutableDirective &D, uint32_t &UpperBound,
+ CodeGenFunction &CGF, const OMPExecutableDirective &D, int32_t &UpperBound,
bool UpperBoundOnly, llvm::Value **CondVal, const Expr **ThreadLimitExpr) {
assert((!CGF.getLangOpts().OpenMPIsTargetDevice || UpperBoundOnly) &&
"Clauses associated with the teams directive expected to be emitted "
@@ -6414,11 +6411,11 @@ const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective(
if (auto Constant = E->getIntegerConstantExpr(CGF.getContext()))
UpperBound = UpperBound ? Constant->getZExtValue()
: std::min(UpperBound,
- uint32_t(Constant->getZExtValue()));
+ int32_t(Constant->getZExtValue()));
}
// If we haven't found a upper bound, remember we saw a thread limiting
// clause.
- if (UpperBound == UINT32_MAX)
+ if (UpperBound == -1)
UpperBound = 0;
if (EPtr)
*EPtr = E;
@@ -6562,7 +6559,7 @@ llvm::Value *CGOpenMPRuntime::emitNumThreadsForTargetDirective(
llvm::Value *CondVal = nullptr;
llvm::Value *ThreadLimitVal = nullptr;
const Expr *ThreadLimitExpr = nullptr;
- uint32_t UpperBound = -1;
+ int32_t UpperBound = -1;
const Expr *NT = getNumThreadsExprForTargetDirective(
CGF, D, UpperBound, /* UpperBoundOnly */ false, &CondVal,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index d2f922da3320924..0c4ad46e881b9c5 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -311,6 +311,14 @@ class CGOpenMPRuntime {
/// An OpenMP-IR-Builder instance.
llvm::OpenMPIRBuilder OMPBuilder;
+ /// Helper to determine the min/max number of threads/teams for \p D.
+ void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D,
+ CodeGenFunction &CGF,
+ int32_t &MinThreadsVal,
+ int32_t &MaxThreadsVal,
+ int32_t &MinTeamsVal,
+ int32_t &MaxTeamsVal);
+
/// Helper to emit outlined function for 'target' directive.
/// \param D Directive to emit.
/// \param ParentName Name of the function that encloses the target region.
@@ -649,7 +657,7 @@ class CGOpenMPRuntime {
/// UpperBoundOnly is true, no expression evaluation is perfomed.
const Expr *getNumThreadsExprForTargetDirective(
CodeGenFunction &CGF, const OMPExecutableDirective &D,
- uint32_t &UpperBound, bool UpperBoundOnly,
+ int32_t &UpperBound, bool UpperBoundOnly,
llvm::Value **CondExpr = nullptr, const Expr **ThreadLimitExpr = nullptr);
/// Emit an expression that denotes the number of threads a target region
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 152a7511f4dd1b0..9d00ebae702802a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -757,13 +757,15 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
+ const OMPExecutableDirective &D;
public:
- NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
- : EST(EST) {}
+ NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
+ const OMPExecutableDirective &D)
+ : EST(EST), D(D) {}
void Enter(CodeGenFunction &CGF) override {
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
+ RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
@@ -772,7 +774,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
}
- } Action(EST);
+ } Action(EST, D);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
@@ -780,10 +782,17 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
IsInTTDRegion = false;
}
-void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
+ CodeGenFunction &CGF,
EntryFunctionState &EST, bool IsSPMD) {
+ int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
+ MaxTeamsVal = -1;
+ computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
+ MinTeamsVal, MaxTeamsVal);
+
CGBuilderTy &Bld = CGF.Builder;
- Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
+ Bld.restoreIP(OMPBuilder.createTargetInit(
+ Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
if (!IsSPMD)
emitGenericVarsProlog(CGF, EST.Loc);
}
@@ -815,19 +824,20 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
bool IsBareKernel;
DataSharingMode Mode;
+ const OMPExecutableDirective &D;
public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
CGOpenMPRuntimeGPU::EntryFunctionState &EST,
- bool IsBareKernel)
+ bool IsBareKernel, const OMPExecutableDirective &D)
: RT(RT), EST(EST), IsBareKernel(IsBareKernel),
- Mode(RT.CurrentDataSharingMode) {}
+ Mode(RT.CurrentDataSharingMode), D(D) {}
void Enter(CodeGenFunction &CGF) override {
if (IsBareKernel) {
RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
return;
}
- RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
+ RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
@@ -839,7 +849,7 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
}
- } Action(*this, EST, IsBareKernel);
+ } Action(*this, EST, IsBareKernel, D);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index c4501a1a2a496b0..46e1361f2f895ba 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -60,8 +60,8 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
void syncCTAThreads(CodeGenFunction &CGF);
/// Helper for target directive initialization.
- void emitKernelInit(CodeGenFunction &CGF, EntryFunctionState &EST,
- bool IsSPMD);
+ void emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF,
+ EntryFunctionState &EST, bool IsSPMD);
/// Helper for target directive finalization.
void emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST,
diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp
index 297f508575d99d9..f74abbe32e454f6 100644
--- a/clang/test/OpenMP/distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_codegen.cpp
@@ -220,7 +220,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
// CHECK1-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
// CHECK1: omp_offload.failed:
-// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], ptr [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], ptr [[TMP3]]) #[[ATTR3:[0-9]+]]
// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
// CHECK1: omp_offload.cont:
// CHECK1-NEXT: ret void
@@ -242,7 +242,7 @@ int fint(void) { return ftemplate<int>(); }
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70.omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[C:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[D:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[C:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[D:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -291,45 +291,45 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13:![0-9]+]]
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8:![0-9]+]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]]
// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 7
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 33, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM2:%.*]] = sext i32 [[TMP17]] to i64
// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM2]]
-// CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[MUL4:%.*]] = fmul float [[TMP15]], [[TMP18]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM5:%.*]] = sext i32 [[TMP20]] to i64
// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM5]]
-// CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[MUL7:%.*]] = fmul float [[MUL4]], [[TMP21]]
-// CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP23]] to i64
// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM8]]
-// CHECK1-NEXT: store float [[MUL7]], ptr [[ARRAYIDX9]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: store float [[MUL7]], ptr [[ARRAYIDX9]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[ADD10:%.*]] = add...
[truncated]
|
f6d6661
to
f6c49fc
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lots of churn but looks straightforward enough. Few nits.
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
Outdated
Show resolved
Hide resolved
996a9b0
to
09f82be
Compare
By associating the kernel environment with the generic kernel we can access middle-end information easily, including the launch bounds ranges that are acceptable. By constraining the number of threads accordingly, we now obey the user provided bounds that were passed via attributes.
09f82be
to
fa6d6d9
Compare
merges to d346c82 [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (llvm#70383) and reverts to buy time to integrate it properly. Also brings in 8f5a18b6e58 Perf/lexer faster slow get char and size (llvm#70543) which is reverted upstream 9 commits later. Change-Id: Idb81b74c26b78bff856d7e4119036918629e47f9
…vm#70383) By associating the kernel environment with the generic kernel we can access middle-end information easily, including the launch bounds ranges that are acceptable. By constraining the number of threads accordingly, we now obey the user-provided bounds that were passed via attributes. Change-Id: I5f27ab4719d72b9a0777798d8225220f6b00b778
By associating the kernel environment with the generic kernel we can
access middle-end information easily, including the launch bounds ranges
that are acceptable. By constraining the number of threads accordingly,
we now obey the user-provided bounds that were passed via attributes.