[ARM] Improve costs for FMin/Max reductions
Similar to the other reductions, this changes the cost of fmin/fmax reductions
under MVE/NEON to perform vector operations until the types need to be
scalarized. The fp16 vectors can perform a VREV+FMIN/FMAX to skip a step of the
reduction, and otherwise need lane-wise extracts from the top lanes.
davemgreen committed Sep 4, 2023
1 parent 3dcf3cb commit 2955cc1
Showing 4 changed files with 100 additions and 52 deletions.
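
To make the strategy in the commit message concrete, here is a rough standalone sketch of the shape being costed (plain C++, no LLVM types, names invented for illustration; this is not the backend's actual lowering or the cost code itself). Elementwise vector FMIN/FMAX steps fold the upper half into the lower half until at most one register's worth of lanes remains, then a scalar tail finishes the reduction. std::max stands in for FMAX; NaN and fast-math semantics are ignored.

#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative model only: assumes a non-empty, power-of-two number of lanes.
float fmaxReduceModel(std::vector<float> Lanes, std::size_t LanesPerVector) {
  std::size_t N = Lanes.size();
  // "Vector" phase: one elementwise max of the two halves per step.
  while (N > LanesPerVector) {
    for (std::size_t I = 0; I < N / 2; ++I)
      Lanes[I] = std::max(Lanes[I], Lanes[I + N / 2]);
    N /= 2;
  }
  // "Scalar" phase: N - 1 pairwise max operations on extracted lanes.
  float Acc = Lanes[0];
  for (std::size_t I = 1; I < N; ++I)
    Acc = std::max(Acc, Lanes[I]);
  return Acc;
}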
44 changes: 44 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1785,6 +1785,50 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
}

InstructionCost
ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {
  EVT ValVT = TLI->getValueType(DL, Ty);

  // In general floating point reductions are a series of elementwise
  // operations, with free extracts on each step. These are either in-order or
  // treewise depending on whether that is allowed by the fast math flags.
  if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
      ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
       (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
       (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
    unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned EltSize = ValVT.getScalarSizeInBits();
    unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
    InstructionCost VecCost;
    while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
      Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
      IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
      VecCost += getIntrinsicInstrCost(ICA, CostKind);
      NumElts /= 2;
    }

    // For fp16 we need to extract the upper lane elements. MVE can add a
    // VREV+FMIN/MAX to perform another vector step instead.
    InstructionCost ExtractCost = 0;
    if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
        NumElts == 8) {
      VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
      NumElts /= 2;
    } else if (ValVT.getVectorElementType() == MVT::f16)
      ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;

    IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
                                {Ty->getElementType(), Ty->getElementType()},
                                FMF);
    return VecCost + ExtractCost +
           (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
  }

  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
}

InstructionCost
ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
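
As a reading aid for the routine above, its arithmetic can be restated as a standalone step counter. This is a simplified sketch with invented names and a unit cost per operation; the real code instead charges the values returned by getIntrinsicInstrCost and getMVEVectorCostFactor.

#include <cstdint>

// Simplified mirror of the cost recurrence in getMinMaxReductionCost.
// Every vector or scalar min/max step is charged one unit here.
uint64_t minMaxReductionStepModel(unsigned NumElts, unsigned EltSizeBits,
                                  unsigned VecLimitBits, bool IsF16,
                                  bool HasMVEFloat) {
  unsigned OrigNumElts = NumElts;
  uint64_t VecSteps = 0;
  // Halve the vector with one elementwise FMIN/FMAX per step while it is
  // still wider than the widest legal vector register.
  while (NumElts && !(NumElts & (NumElts - 1)) &&
         (uint64_t)NumElts * EltSizeBits > VecLimitBits) {
    ++VecSteps;
    NumElts /= 2;
  }
  uint64_t ExtractCost = 0;
  if (HasMVEFloat && IsF16 && NumElts == 8) {
    VecSteps += 2; // VREV + FMIN/FMAX buys one more in-register halving.
    NumElts /= 2;
  } else if (IsF16) {
    // Otherwise the upper f16 lanes need individual lane extracts; the real
    // code charges half of the *original* element count here.
    ExtractCost = OrigNumElts / 2;
  }
  // The remaining lanes are reduced with NumElts - 1 scalar min/max ops.
  return VecSteps + ExtractCost + (NumElts - 1);
}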
4 changes: 4 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -290,6 +290,10 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
                                         VectorType *ValTy,
                                         TTI::TargetCostKind CostKind);

  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind);

  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind);

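
For context, out-of-tree callers reach this override through the generic TargetTransformInfo interface rather than ARMTTIImpl directly, and the guard in the new code shows that fmin/fmax reductions arrive with the minnum/maxnum intrinsic IDs. A rough sketch of such a query, assuming a TargetTransformInfo reference is already available (for example from a pass's analysis results); the function and variable names here are invented:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Ask the cost model what a maxnum-style reduction over <16 x float> costs,
// matching the v16f32 case exercised by the test below.
static InstructionCost costV16F32FMaxReduction(const TargetTransformInfo &TTI,
                                               LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 16);
  FastMathFlags FMF; // no fast-math flags, as in the test's plain call
  return TTI.getMinMaxReductionCost(Intrinsic::maxnum, VecTy, FMF,
                                    TargetTransformInfo::TCK_RecipThroughput);
}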
8 changes: 4 additions & 4 deletions llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
@@ -294,19 +294,19 @@ define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {

define void @reduce_fmax(<16 x float> %va) {
; THRU-LABEL: 'reduce_fmax'
-; THRU-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; THRU-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; LATE-LABEL: 'reduce_fmax'
-; LATE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'reduce_fmax'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'reduce_fmax'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
