diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9776c20400d6fd..3f016d85d8ede3 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1288,15 +1288,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   case Intrinsic::vector_reduce_fmin:
   case Intrinsic::vector_reduce_umax:
   case Intrinsic::vector_reduce_umin: {
-    if (isa<ScalableVectorType>(RetTy))
-      return BaseT::getIntrinsicInstrCost(ICA, CostKind);
     IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
     return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
   }
   case Intrinsic::vector_reduce_fadd:
   case Intrinsic::vector_reduce_fmul: {
-    if (isa<ScalableVectorType>(RetTy))
-      return BaseT::getIntrinsicInstrCost(ICA, CostKind);
     IntrinsicCostAttributes Attrs(
         IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
     return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 68d382fb784bb3..ffa045846e59c0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1096,11 +1096,70 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
   return false;
 }
 
+int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                                           bool IsPairwise, bool IsUnsigned,
+                                           TTI::TargetCostKind CostKind) {
+  if (!isa<ScalableVectorType>(Ty))
+    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
+                                         CostKind);
+  assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
+         "Both vectors need to be scalable");
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  int LegalizationCost = 0;
+  if (LT.first > 1) {
+    Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
+    unsigned CmpOpcode =
+        Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp;
+    LegalizationCost =
+        getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy,
+                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
+        getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy,
+                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
+    LegalizationCost *= LT.first - 1;
+  }
+
+  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
+}
+
+int AArch64TTIImpl::getArithmeticReductionCostSVE(
+    unsigned Opcode, VectorType *ValTy, bool IsPairwise,
+    TTI::TargetCostKind CostKind) {
+  assert(!IsPairwise && "Cannot be pairwise to continue");
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+  int LegalizationCost = 0;
+  if (LT.first > 1) {
+    Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
+    LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
+    LegalizationCost *= LT.first - 1;
+  }
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+  // Add the final reduction cost for the legal horizontal reduction.
+  switch (ISD) {
+  case ISD::ADD:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::FADD:
+    return LegalizationCost + 2;
+  default:
+    // TODO: Replace with an invalid cost once InstructionCost is used.
+    // Cases not supported by SVE.
+    return 16;
+  }
+}
+
 int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
                                                VectorType *ValTy,
                                                bool IsPairwiseForm,
                                                TTI::TargetCostKind CostKind) {
 
+  if (isa<ScalableVectorType>(ValTy))
+    return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm,
+                                         CostKind);
   if (IsPairwiseForm)
     return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
                                              CostKind);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 7dded02b2a6f78..7c9360ada92eb8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -139,6 +139,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
+  int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                             bool IsPairwise, bool IsUnsigned,
+                             TTI::TargetCostKind CostKind);
+
+  int getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy,
+                                    bool IsPairwiseForm,
+                                    TTI::TargetCostKind CostKind);
+
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll
new file mode 100644
index 00000000000000..486e7aaac68aa4
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll
@@ -0,0 +1,251 @@
+; Check getIntrinsicInstrCost in BasicTTIImpl.h with SVE for vector.reduce.
+; Checks both legal and illegal vector sizes.
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define i32 @add.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'add.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @add.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'add.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define i32 @mul.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'mul.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @mul.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'mul.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define i32 @and.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'and.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @and.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'and.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define i32 @or.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'or.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @or.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'or.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define i32 @xor.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'xor.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @xor.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'xor.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define i32 @umin.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'umin.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @umin.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'umin.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define float @fmax.f32.nxv4f32(<vscale x 4 x float> %v) {
+; CHECK-LABEL: 'fmax.f32.nxv4f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+
+  %r = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v)
+  ret float %r
+}
+
+define double @fmax.f64.nxv4f64(<vscale x 4 x double> %v) {
+; CHECK-LABEL: 'fmax.f64.nxv4f64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+
+  %r = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v)
+  ret double %r
+}
+
+define float @fmin.f32.nxv4f32(<vscale x 4 x float> %v) {
+; CHECK-LABEL: 'fmin.f32.nxv4f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+
+  %r = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v)
+  ret float %r
+}
+
+define double @fmin.f64.nxv4f64(<vscale x 4 x double> %v) {
+; CHECK-LABEL: 'fmin.f64.nxv4f64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
+
+  %r = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v)
+  ret double %r
+}
+
+define i32 @umax.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'umax.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @umax.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'umax.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define i32 @smin.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'smin.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @smin.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'smin.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define i32 @smax.i32.nxv4i32(<vscale x 4 x i32> %v) {
+; CHECK-LABEL: 'smax.i32.nxv4i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+
+  %r = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v)
+  ret i32 %r
+}
+
+define i64 @smax.i64.nxv4i64(<vscale x 4 x i64> %v) {
+; CHECK-LABEL: 'smax.i64.nxv4i64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+
+  %r = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v)
+  ret i64 %r
+}
+
+define float @fadda_nxv4f32(float %start, <vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: 'fadda_nxv4f32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call float @llvm.vector.reduce.fadd.nxv4f32(float %start, <vscale x 4 x float> %a)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %res
+
+  %res = call float @llvm.vector.reduce.fadd.nxv4f32(float %start, <vscale x 4 x float> %a)
+  ret float %res
+}
+
+define double @fadda_nxv4f64(double %start, <vscale x 4 x double> %a) #0 {
+; CHECK-LABEL: 'fadda_nxv4f64'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call double @llvm.vector.reduce.fadd.nxv4f64(double %start, <vscale x 4 x double> %a)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+
+  %res = call double @llvm.vector.reduce.fadd.nxv4f64(double %start, <vscale x 4 x double> %a)
+  ret double %res
+}
+
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32>)
+declare float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float>)
+declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>)
+declare i32 @llvm.vector.reduce.fmin.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>)
+declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
+declare i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64>)
+declare double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double>)
+declare double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double>)
+declare i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64>)
+declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)