-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[Analysis][AArch64] Add cost model for loop.dependence.{war/raw}.mask #167551
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
This PR adds the cost model for the loop dependence mask intrinsics, both for cases where they must be expanded and for cases where they can be lowered for AArch64.
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-llvm-analysis Author: Sam Tebbs (SamTebbs33) Changes: This PR adds the cost model for the loop dependence mask intrinsics, both for cases where they must be expanded and for cases where they can be lowered for AArch64. Full diff: https://github.com/llvm/llvm-project/pull/167551.diff 3 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 1c167af4b0478..c1b0dc5db9607 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2190,6 +2190,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Otherwise, fallback to default scalarization cost.
break;
}
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask: {
+ InstructionCost Cost = 0;
+ Type *PtrTy = ICA.getArgTypes()[0];
+ bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
+
+ Cost +=
+ thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind);
+ if (IsReadAfterWrite) {
+ IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {});
+ Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
+ }
+
+ Cost +=
+ thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind);
+ Type *CmpTy =
+ getTLI()
+ ->getSetCCResultType(
+ thisT()->getDataLayout(), RetTy->getContext(),
+ getTLI()->getValueType(thisT()->getDataLayout(), PtrTy))
+ .getTypeForEVT(RetTy->getContext());
+ Cost += thisT()->getCmpSelInstrCost(
+ BinaryOperator::ICmp, CmpTy, PtrTy,
+ IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind);
+
+ // The deconstructed active lane mask
+ VectorType *RetTyVec = cast<VectorType>(RetTy);
+ VectorType *SplatTy = cast<VectorType>(RetTyVec->getWithNewType(PtrTy));
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {},
+ CostKind, 0, nullptr);
+ IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {},
+ FMF);
+ Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy,
+ SplatTy, CmpInst::ICMP_ULT, CostKind);
+
+ Cost +=
+ thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy,
+ TTI::CastContextHint::None, CostKind);
+ Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt,
+ RetTyVec->getElementType(), CmpTy,
+ TTI::CastContextHint::None, CostKind);
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {},
+ CostKind, 0, nullptr);
+ Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
+ return Cost;
+ }
}
// Assume that we need to scalarize this intrinsic.)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 197aae6e03cb1..c4bd3c7803c1a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1032,6 +1032,40 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask: {
+ auto *EltSize = cast<ConstantInt>(ICA.getArgs()[2]);
+ EVT VecVT = getTLI()->getValueType(DL, RetTy);
+ // An invalid element size and return type combination must be expanded.
+ bool MustBeExpanded = false;
+ switch (EltSize->getSExtValue()) {
+ case 1:
+ if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1)
+ MustBeExpanded = true;
+ break;
+ case 2:
+ if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1)
+ MustBeExpanded = true;
+ break;
+ case 4:
+ if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1)
+ MustBeExpanded = true;
+ break;
+ case 8:
+ if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1)
+ MustBeExpanded = true;
+ break;
+ default:
+ MustBeExpanded = true;
+ // Other element sizes are incompatible with whilewr/rw, so expand instead
+ break;
+ }
+
+ // The whilewr/rw instructions require SVE2 or SME
+ if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME()))
+ break;
+ return 1;
+ }
case Intrinsic::experimental_vector_extract_last_active:
if (ST->isSVEorStreamingSVEAvailable()) {
auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
diff --git a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
new file mode 100644
index 0000000000000..1074d41d994d9
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-EXPANDED
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefix=CHECK
+
+; loop.dependence.{war,raw}.mask can be lowered to while{wr,rw} if SVE2 or SME is enabled.
+define void @loop_dependence_war_mask(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_war_mask'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+ %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+ %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+ %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+ ret void
+}
+
+define void @loop_dependence_raw_mask(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_raw_mask'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+ %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+ %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+ %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+ ret void
+}
+
+; Invalid element size and return type combinations must be expanded, even with sve2/sme
+define void @loop_dependence_war_mask_invalid(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask_invalid'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_war_mask_invalid'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+ %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+ %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+ %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+ %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+ ret void
+}
+
+define void @loop_dependence_raw_mask_invalid(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask_invalid'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_raw_mask_invalid'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+ %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+ %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+ %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+ %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+ ret void
+}
|
| } | ||
| case Intrinsic::loop_dependence_raw_mask: | ||
| case Intrinsic::loop_dependence_war_mask: { | ||
| InstructionCost Cost = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment that explains why the cost is broken down like this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. I hope that's comprehensive enough.
| bool MustBeExpanded = false; | ||
| switch (EltSize->getSExtValue()) { | ||
| case 1: | ||
| if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1) | ||
| MustBeExpanded = true; | ||
| break; | ||
| case 2: | ||
| if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1) | ||
| MustBeExpanded = true; | ||
| break; | ||
| case 4: | ||
| if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1) | ||
| MustBeExpanded = true; | ||
| break; | ||
| case 8: | ||
| if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1) | ||
| MustBeExpanded = true; | ||
| break; | ||
| default: | ||
| MustBeExpanded = true; | ||
| // Other element sizes are incompatible with whilewr/rw, so expand instead | ||
| break; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: the switch looks a little verbose; this could be written as:
unsigned EltSize = cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
EVT VecVT = getTLI()->getValueType(DL, RetTy);
bool Expand = VecVT.getVectorMinNumElements() != (16 / EltSize) ||
!isPowerOf2_32(EltSize);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: EltSize -> EltSizeInBytes (just so the units are clear).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks both.
|
Ping |
This PR adds the cost model for the loop dependence mask intrinsics, both for cases where they must be expanded and for cases where they can be lowered for AArch64.