Skip to content

Commit

Permalink
[RISCV][CostModel] Estimate cost of llvm.vector.reduce.fmaximum/fmini…
Browse files Browse the repository at this point in the history
…mum (#80697)

The ‘llvm.vector.reduce.fmaximum/fminimum.*’ intrinsics propagate NaNs
if any element of the vector is a NaN.
Following #79402, the patch adds the cost for NaN check (vmfne + vcpop)
  • Loading branch information
arcbbb committed Mar 25, 2024
1 parent cbbbab3 commit 3cb0241
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 52 deletions.
45 changes: 45 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,51 @@ RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
}

if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
SmallVector<unsigned, 3> Opcodes;
InstructionCost ExtraCost = 0;
switch (IID) {
case Intrinsic::maximum:
if (FMF.noNaNs()) {
Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
} else {
Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
RISCV::VFMV_F_S};
// Cost of Canonical Nan + branch
// lui a0, 523264
// fmv.w.x fa0, a0
Type *DstTy = Ty->getScalarType();
const unsigned EltTyBits = DstTy->getScalarSizeInBits();
Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
ExtraCost = 1 +
getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
TTI::CastContextHint::None, CostKind) +
getCFInstrCost(Instruction::Br, CostKind);
}
break;

case Intrinsic::minimum:
if (FMF.noNaNs()) {
Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
} else {
Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
RISCV::VFMV_F_S};
// Cost of Canonical Nan + branch
// lui a0, 523264
// fmv.w.x fa0, a0
Type *DstTy = Ty->getScalarType();
const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
ExtraCost = 1 +
getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
TTI::CastContextHint::None, CostKind) +
getCFInstrCost(Instruction::Br, CostKind);
}
break;
}
return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

// IR Reduction is composed by two vmv and one rvv reduction instruction.
InstructionCost BaseCost = 2;

Expand Down
91 changes: 65 additions & 26 deletions llvm/test/Analysis/CostModel/RISCV/reduce-fmaximum.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,37 @@

define float @reduce_fmaximum_f32(float %arg) {
; CHECK-LABEL: 'reduce_fmaximum_f32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float undef
;
; SIZE-LABEL: 'reduce_fmaximum_f32'
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
;
%V2 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
Expand All @@ -32,6 +46,13 @@ define float @reduce_fmaximum_f32(float %arg) {
%V32 = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
%V64 = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
%V128 = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
call fast float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> undef)
call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> undef)
call fast float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> undef)
call fast float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> undef)
call fast float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> undef)
call fast float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> undef)
call fast float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> undef)
ret float undef
}
declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
Expand All @@ -44,21 +65,33 @@ declare float @llvm.vector.reduce.fmaximum.v128f32(<128 x float>)

define double @reduce_fmaximum_f64(double %arg) {
; CHECK-LABEL: 'reduce_fmaximum_f64'
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double undef
;
; SIZE-LABEL: 'reduce_fmaximum_f64'
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef
;
%V2 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
Expand All @@ -67,6 +100,12 @@ define double @reduce_fmaximum_f64(double %arg) {
%V16 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
%V32 = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
%V64 = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> undef)
call fast double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> undef)
call fast double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> undef)
call fast double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> undef)
call fast double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> undef)
call fast double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> undef)
ret double undef
}
declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
Expand Down
Loading

0 comments on commit 3cb0241

Please sign in to comment.