Skip to content

Commit

Permalink
[RISCV] Add basic cost modeling for fixed-length vector reductions.
Browse files Browse the repository at this point in the history
Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D121447
  • Loading branch information
yetingk committed Mar 14, 2022
1 parent eeb3bfd commit ae7c664
Show file tree
Hide file tree
Showing 9 changed files with 1,589 additions and 4 deletions.
58 changes: 58 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Expand Up @@ -217,6 +217,64 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
return NumLoads * MemOpCost;
}

/// Estimate the cost of a min/max reduction over the vector type \p Ty.
///
/// The RVV-specific estimate is used only for fixed-length vectors that are
/// lowered with RVV and whose element width does not exceed ELEN; every other
/// case is forwarded unchanged to the generic BaseT implementation.
InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                     bool IsUnsigned,
                                     TTI::TargetCostKind CostKind) {
  // FIXME: Only fixed-length vectors are modeled for now. The remaining two
  // conditions require RVV lowering of fixed-length vectors and an element
  // width that fits in ELEN.
  const bool UseRVVEstimate =
      isa<FixedVectorType>(Ty) && ST->useRVVForFixedLengthVectors() &&
      Ty->getScalarSizeInBits() <= ST->getMaxELENForFixedLengthVectors();
  if (!UseRVVEstimate)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> Legalized =
      TLI->getTypeLegalizationCost(DL, Ty);
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();

  // An IR reduction is composed of two vmv and one RVV reduction instruction.
  InstructionCost FixedPart = 2;

  // Total: extra legalization cost + fixed part + ceil(log2(#elements)).
  return (Legalized.first - 1) + FixedPart + Log2_32_Ceil(NumElts);
}

/// Estimate the cost of an arithmetic reduction with IR opcode \p Opcode over
/// the vector type \p VTy.
///
/// The RVV-specific estimate covers ADD, OR, XOR, AND and FADD reductions of
/// fixed-length, non-i1 vectors that are lowered with RVV and whose element
/// width does not exceed ELEN. Everything else is forwarded unchanged to the
/// generic BaseT implementation.
InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *VTy,
                                         Optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  // FIXME: Only fixed-length vectors are modeled for now, and i1 and/or
  // reductions are not supported yet. The remaining two conditions require RVV
  // lowering of fixed-length vectors and an element width that fits in ELEN.
  const bool TypeIsModeled =
      isa<FixedVectorType>(VTy) && !VTy->getElementType()->isIntegerTy(1) &&
      ST->useRVVForFixedLengthVectors() &&
      VTy->getScalarSizeInBits() <= ST->getMaxELENForFixedLengthVectors();
  if (!TypeIsModeled)
    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);

  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISDOpcode && "Invalid opcode");

  // Only these reduction kinds get the RVV-specific estimate.
  switch (ISDOpcode) {
  case ISD::ADD:
  case ISD::OR:
  case ISD::XOR:
  case ISD::AND:
  case ISD::FADD:
    break;
  default:
    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
  }

  std::pair<InstructionCost, MVT> Legalized =
      TLI->getTypeLegalizationCost(DL, VTy);
  unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();

  // An IR reduction is composed of two vmv and one RVV reduction instruction.
  InstructionCost FixedPart = 2;

  // An ordered reduction is charged one step per element; an unordered one is
  // charged ceil(log2(#elements)) steps.
  unsigned ReductionSteps =
      TTI::requiresOrderedReduction(FMF) ? NumElts : Log2_32_Ceil(NumElts);
  return (Legalized.first - 1) + FixedPart + ReductionSteps;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Expand Up @@ -88,6 +88,14 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
TTI::TargetCostKind CostKind,
const Instruction *I);

/// Estimate the cost of a min/max reduction over the vector type \p Ty.
/// \p CondTy, \p IsUnsigned and \p CostKind are passed through to the generic
/// BaseT implementation whenever the RISC-V specific estimate does not apply.
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsUnsigned,
TTI::TargetCostKind CostKind);

/// Estimate the cost of an arithmetic reduction with IR opcode \p Opcode over
/// the vector type \p Ty. \p FMF determines whether the reduction has to be
/// costed as an ordered reduction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind);

bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
if (!ST->hasVInstructions())
return false;
Expand Down
172 changes: 172 additions & 0 deletions llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
@@ -0,0 +1,172 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=256 -passes='print<cost-model>' -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefix=RISCV32
; RUN: opt < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 -passes='print<cost-model>' -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefix=RISCV64

; Cost of llvm.vector.reduce.add on <N x i8> for N = 1, 2, 4, ..., 128.
; Both the riscv32 and the riscv64 run expect 2 + log2(N) for every N here.
define i32 @reduce_i8(i32 %arg) {
; RISCV32-LABEL: 'reduce_i8'
; RISCV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; RISCV64-LABEL: 'reduce_i8'
; RISCV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
%V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
%V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
%V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
%V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
%V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
%V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
%V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
ret i32 undef
}

; Cost of llvm.vector.reduce.add on <N x i16> for N = 1, 2, 4, ..., 128.
; Both the riscv32 and the riscv64 run expect 2 + log2(N) for every N here.
define i32 @reduce_i16(i32 %arg) {
; RISCV32-LABEL: 'reduce_i16'
; RISCV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; RISCV64-LABEL: 'reduce_i16'
; RISCV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
%V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
%V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
%V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
%V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
%V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
%V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
%V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
ret i32 undef
}

; Cost of llvm.vector.reduce.add on <N x i32> for N = 1, 2, 4, ..., 128.
; Both runs expect 2 + log2(N) up to N = 64; N = 128 is expected to cost 10,
; one more than that formula gives.
; NOTE(review): the extra unit presumably comes from the type-legalization
; term of the cost formula for this wider vector - confirm.
define i32 @reduce_i32(i32 %arg) {
; RISCV32-LABEL: 'reduce_i32'
; RISCV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; RISCV64-LABEL: 'reduce_i32'
; RISCV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
%V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
%V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
%V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
%V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
%V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
%V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
%V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
ret i32 undef
}

; Cost of llvm.vector.reduce.add on <N x i64> for N = 1, 2, 4, ..., 128.
; Both runs expect 2 + log2(N) up to N = 32; N = 64 is expected to cost 9 and
; N = 128 to cost 12, i.e. 1 and 3 more than that formula gives.
; NOTE(review): the extra units presumably come from the type-legalization
; term of the cost formula for these wider vectors - confirm.
define i32 @reduce_i64(i32 %arg) {
; RISCV32-LABEL: 'reduce_i64'
; RISCV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; RISCV64-LABEL: 'reduce_i64'
; RISCV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
%V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
%V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
%V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
%V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
%V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
%V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
%V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
%V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
ret i32 undef
}

declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
declare i32 @llvm.vector.reduce.add.v128i32(<128 x i32>)
declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
declare i64 @llvm.vector.reduce.add.v128i64(<128 x i64>)

0 comments on commit ae7c664

Please sign in to comment.