[RISCV] Add basic code modeling for fixed length vector reduction.

Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D121447
llvm · Mar 14, 2022 · ae7c664 · ae7c664
1 parent eeb3bfd
commit ae7c664
Show file tree

Hide file tree

Showing 9 changed files with 1,589 additions and 4 deletions.
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -217,6 +217,64 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
   return NumLoads * MemOpCost;
 }
 
+InstructionCost
+RISCVTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                                     bool IsUnsigned,
+                                     TTI::TargetCostKind CostKind) {
+  // FIXME: Only supporting fixed vectors for now.
+  if (!isa<FixedVectorType>(Ty))
+    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
+
+  if (!ST->useRVVForFixedLengthVectors())
+    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
+
+  // Skip if scalar size of Ty is bigger than ELEN.
+  if (Ty->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
+
+  // IR Reduction is composed by two vmv and one rvv reduction instruction.
+  InstructionCost BaseCost = 2;
+  unsigned VL = cast<FixedVectorType>(Ty)->getNumElements();
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
+}
+
+InstructionCost
+RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *VTy,
+                                         Optional<FastMathFlags> FMF,
+                                         TTI::TargetCostKind CostKind) {
+  // FIXME: Only supporting fixed vectors for now.
+  if (!isa<FixedVectorType>(VTy))
+    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
+
+  // FIXME: Do not support i1 and/or reduction now.
+  if (VTy->getElementType()->isIntegerTy(1))
+    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
+
+  if (!ST->useRVVForFixedLengthVectors())
+    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
+
+  // Skip if scalar size of VTy is bigger than ELEN.
+  if (VTy->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
+      ISD != ISD::FADD)
+    return BaseT::getArithmeticReductionCost(Opcode, VTy, FMF, CostKind);
+
+  // IR Reduction is composed by two vmv and one rvv reduction instruction.
+  InstructionCost BaseCost = 2;
+  unsigned VL = cast<FixedVectorType>(VTy)->getNumElements();
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VTy);
+
+  if (TTI::requiresOrderedReduction(FMF))
+    return (LT.first - 1) + BaseCost + VL;
+  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
+}
+
 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -88,6 +88,14 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I);
 
+  InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                                         bool IsUnsigned,
+                                         TTI::TargetCostKind CostKind);
+
+  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+                                             Optional<FastMathFlags> FMF,
+                                             TTI::TargetCostKind CostKind);
+
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
     if (!ST->hasVInstructions())
       return false;

diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=256 -passes='print<cost-model>' -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefix=RISCV32
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 -passes='print<cost-model>' -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefix=RISCV64
+
+define i32 @reduce_i8(i32 %arg) {
+; RISCV32-LABEL: 'reduce_i8'
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; RISCV64-LABEL: 'reduce_i8'
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %V1   = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+  %V2   = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+  %V4   = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+  %V8   = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; RISCV32-LABEL: 'reduce_i16'
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; RISCV64-LABEL: 'reduce_i16'
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %V1   = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+  %V2   = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+  %V4   = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+  %V8   = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+  %V16  = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+  %V32  = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+  %V64  = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+  %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; RISCV32-LABEL: 'reduce_i32'
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; RISCV64-LABEL: 'reduce_i32'
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %V1   = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
+  %V2   = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+  %V4   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+  %V8   = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+  %V16  = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+  %V32  = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+  %V64  = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+  %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i64(i32 %arg) {
+; RISCV32-LABEL: 'reduce_i64'
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
+; RISCV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; RISCV64-LABEL: 'reduce_i64'
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
+; RISCV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %V1   = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+  %V2   = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+  %V4   = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+  %V8   = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+  %V16  = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+  %V32  = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+  %V64  = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+  %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
+  ret i32 undef
+}
+
+declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
+declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
+declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
+declare i32 @llvm.vector.reduce.add.v128i32(<128 x i32>)
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
+declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
+declare i64 @llvm.vector.reduce.add.v128i64(<128 x i64>)