Skip to content

Commit

Permalink
[SLP] Add cost model for llvm.powi.* intrinsics (REAPPLIED)
Browse files Browse the repository at this point in the history
Patch was reverted in 4c5f10a due to buildbot failures, now being
reapplied with updated AArch64 and RISCV tests.

This patch adds handling for the llvm.powi.* intrinsics in
BasicTTIImplBase::getIntrinsicInstrCost() and improves vectorization.
Closes #53887.

Differential Revision: https://reviews.llvm.org/D128172
  • Loading branch information
omern1 committed Jun 24, 2022
1 parent 91d61c1 commit 0d41794
Show file tree
Hide file tree
Showing 8 changed files with 242 additions and 520 deletions.
20 changes: 20 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Expand Up @@ -1417,6 +1417,26 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
default:
break;

case Intrinsic::powi:
if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
ShouldOptForSize)) {
// The cost is modeled on the expansion performed by ExpandPowI in
// SelectionDAGBuilder.
APInt Exponent = RHSC->getValue().abs();
unsigned ActiveBits = Exponent.getActiveBits();
unsigned PopCount = Exponent.countPopulation();
InstructionCost Cost = (ActiveBits + PopCount - 2) *
thisT()->getArithmeticInstrCost(
Instruction::FMul, RetTy, CostKind);
if (RHSC->getSExtValue() < 0)
Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
CostKind);
return Cost;
}
}
break;
case Intrinsic::cttz:
// FIXME: If necessary, this should go in target-specific overrides.
if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Expand Up @@ -2196,6 +2196,18 @@ class TargetLoweringBase {
return false;
}

/// Return true if it is beneficial to expand an @llvm.powi.* intrinsic.
/// If not optimizing for size, expanding @llvm.powi.* intrinsics is always
/// considered beneficial.
/// If optimizing for size, expansion is only considered beneficial for upto
/// 5 multiplies and a divide (if the exponent is negative).
bool isBeneficialToExpandPowI(int Exponent, bool OptForSize) const {
if (Exponent < 0)
Exponent = -Exponent;
return !OptForSize ||
(countPopulation((unsigned int)Exponent) + Log2_32(Exponent) < 7);
}

//===--------------------------------------------------------------------===//
// TargetLowering Configuration Methods - These methods should be invoked by
// the derived class constructor to configure this object for the target.
Expand Down
26 changes: 12 additions & 14 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Expand Up @@ -5346,38 +5346,36 @@ static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
/// ExpandPowI - Expand a llvm.powi intrinsic.
static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
SelectionDAG &DAG) {
// If RHS is a constant, we can expand this out to a multiplication tree,
// otherwise we end up lowering to a call to __powidf2 (for example). When
// optimizing for size, we only want to do this if the expansion would produce
// a small number of multiplies, otherwise we do the full expansion.
// If RHS is a constant, we can expand this out to a multiplication tree if
// it's beneficial on the target, otherwise we end up lowering to a call to
// __powidf2 (for example).
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
// Get the exponent as a positive value.
unsigned Val = RHSC->getSExtValue();
if ((int)Val < 0) Val = -Val;

// powi(x, 0) -> 1.0
if (Val == 0)
return DAG.getConstantFP(1.0, DL, LHS.getValueType());

bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize ||
// If optimizing for size, don't insert too many multiplies.
// This inserts up to 5 multiplies.
countPopulation(Val) + Log2_32(Val) < 7) {
if (DAG.getTargetLoweringInfo().isBeneficialToExpandPowI(
Val, DAG.shouldOptForSize())) {
// Get the exponent as a positive value.
if ((int)Val < 0)
Val = -Val;
// We use the simple binary decomposition method to generate the multiply
// sequence. There are more optimal ways to do this (for example,
// powi(x,15) generates one more multiply than it should), but this has
// the benefit of being both really simple and much better than a libcall.
SDValue Res; // Logically starts equal to 1.0
SDValue Res; // Logically starts equal to 1.0
SDValue CurSquare = LHS;
// TODO: Intrinsics should have fast-math-flags that propagate to these
// nodes.
while (Val) {
if (Val & 1) {
if (Res.getNode())
Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
Res =
DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
else
Res = CurSquare; // 1.0*CurSquare.
Res = CurSquare; // 1.0*CurSquare.
}

CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
Expand Down
15 changes: 12 additions & 3 deletions llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
Expand Up @@ -225,12 +225,12 @@ declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x
declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)

define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
; CHECK-LABEL: 'unsupported_fp_ops'
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
Expand All @@ -242,7 +242,7 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
%sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
%cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
%pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
%exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
%exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
%log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
Expand All @@ -251,6 +251,15 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
ret void
}

define void @powi(<vscale x 4 x float> %vec) {
; CHECK-LABEL: 'powi'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
ret void
}

declare <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float>)
declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
Expand Down
15 changes: 12 additions & 3 deletions llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes='print<cost-model>' 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v | FileCheck %s

define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
; CHECK-LABEL: 'unsupported_fp_ops'
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
Expand All @@ -20,7 +20,7 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
%sin = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %vec)
%cos = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %vec)
%pow = call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x float> %vec)
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 %extraarg)
%exp = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %vec)
%exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %vec)
%log = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %vec)
Expand All @@ -31,6 +31,15 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec) {
ret void
}

define void @powi(<vscale x 4 x float> %vec) {
; CHECK-LABEL: 'powi'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
ret void
}

define void @fshr(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c) {
; CHECK-LABEL: 'fshr'
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %1 = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
Expand Down

0 comments on commit 0d41794

Please sign in to comment.