Skip to content

Commit

Permalink
[RISCV][CostModel] Refine Arithmetic reduction costs (#79103)
Browse files Browse the repository at this point in the history
This patch is split off from #77342

- Correct for CodeSize cost that 1 instruction is not included. 3 is
from {VMV.S, ReductionOp, VMV.X}
- Add SplitCost
Unordered reduction chain a series of VADD/VFADD/... which scales with
LMUL.
 Ordered reductions chain a series of VFREDOSUMs.
- Use MVT to estimate VL.
  • Loading branch information
arcbbb committed Jan 25, 2024
1 parent f59eef6 commit 84be954
Show file tree
Hide file tree
Showing 9 changed files with 361 additions and 307 deletions.
78 changes: 66 additions & 12 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
}
case RISCV::VMV_X_S:
case RISCV::VMV_S_X:
case RISCV::VFMV_F_S:
case RISCV::VFMV_S_F:
case RISCV::VMNAND_MM:
case RISCV::VCPOP_M:
Cost += 1;
break;
default:
Expand Down Expand Up @@ -966,20 +970,70 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
if (Ty->getElementType()->isIntegerTy(1))
// vcpop sequences, see vreduction-mask.ll
return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
SmallVector<unsigned, 3> Opcodes;
Type *ElementTy = Ty->getElementType();
if (ElementTy->isIntegerTy(1)) {
if (ISD == ISD::AND) {
// Example sequences:
// vsetvli a0, zero, e8, mf8, ta, ma
// vmnot.m v8, v0
// vcpop.m a0, v8
// seqz a0, a0
Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
return (LT.first - 1) +
getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
CmpInst::ICMP_EQ, CostKind);
} else {
// Example sequences:
// vsetvli a0, zero, e8, mf8, ta, ma
// vcpop.m a0, v0
// snez a0, a0
Opcodes = {RISCV::VCPOP_M};
return (LT.first - 1) +
getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
CmpInst::ICMP_NE, CostKind);
}
}

// IR Reduction is composed by two vmv and one rvv reduction instruction.
InstructionCost BaseCost = 2;

if (CostKind == TTI::TCK_CodeSize)
return (LT.first - 1) + BaseCost;

unsigned VL = getEstimatedVLFor(Ty);
if (TTI::requiresOrderedReduction(FMF))
return (LT.first - 1) + BaseCost + VL;
return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
if (TTI::requiresOrderedReduction(FMF)) {
Opcodes.push_back(RISCV::VFMV_S_F);
for (unsigned i = 0; i < LT.first.getValue(); i++)
Opcodes.push_back(RISCV::VFREDOSUM_VS);
Opcodes.push_back(RISCV::VFMV_F_S);
return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}
unsigned SplitOp;
switch (ISD) {
case ISD::ADD:
SplitOp = RISCV::VADD_VV;
Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
break;
case ISD::OR:
SplitOp = RISCV::VOR_VV;
Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
break;
case ISD::XOR:
SplitOp = RISCV::VXOR_VV;
Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
break;
case ISD::AND:
SplitOp = RISCV::VAND_VV;
Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
break;
case ISD::FADD:
SplitOp = RISCV::VFADD_VV;
Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
break;
}
// Add a cost for data larger than LMUL8
InstructionCost SplitCost =
(LT.first > 1) ? (LT.first - 1) *
getRISCVInstructionCost(SplitOp, LT.second, CostKind)
: 0;
return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
Expand Down
70 changes: 35 additions & 35 deletions llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@ define i32 @reduce_i8(i32 %arg) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i8'
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
Expand All @@ -85,14 +85,14 @@ define i32 @reduce_i16(i32 %arg) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i16'
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
Expand All @@ -115,18 +115,18 @@ define i32 @reduce_i32(i32 %arg) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i32'
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
Expand All @@ -148,19 +148,19 @@ define i32 @reduce_i64(i32 %arg) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SIZE-LABEL: 'reduce_i64'
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
Expand Down

0 comments on commit 84be954

Please sign in to comment.