[RISCV][CostModel] Updates reduction and shuffle cost #77342

arcbbb · 2024-01-08T17:01:06Z

Make andi cost 1 in SK_Broadcast
Query the cost of VID_V, VRSUB_VX/VRSUB_VI which would scale with LMUL

llvmbot · 2024-01-08T17:01:25Z

@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-backend-risc-v

Author: Shih-Po Hung (arcbbb)

Changes

Make VMV_S_* and VMV_*_S cost independent of LMUL
Uses getRISCVInstructionCost() in reduction cost Add SplitCost for lmul larger than 8. e.g. The cost of vredsum on [vscale x 16 x i64] will be the cost of vadd on [vscale x 8 x i64] plus the cost of vredsum on [vscale x 8 x i64].

Patch is 344.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77342.diff

17 Files Affected:

(modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+107-38)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-add.ll (+51-51)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-and.ll (+35-35)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll (+54-54)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-max.ll (+70-70)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-min.ll (+70-70)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-or.ll (+57-57)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-scalable-fp.ll (+73-73)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll (+114-114)
(modified) llvm/test/Analysis/CostModel/RISCV/reduce-xor.ll (+51-51)
(modified) llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll (+4-4)
(modified) llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll (+14-14)
(modified) llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll (+6-6)
(modified) llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll (+38-38)
(modified) llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll (+6-6)
(modified) llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll (+4-4)
(modified) llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll (+10-10)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index b3916c98700519..6c143a762b0c03 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -46,6 +46,9 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
   InstructionCost Cost = 0;
   for (auto Op : OpCodes) {
     switch (Op) {
+    case RISCV::SLT:
+      Cost += 1;
+      break;
     case RISCV::VRGATHER_VI:
       Cost += TLI->getVRGatherVICost(VT);
       break;
@@ -84,8 +87,14 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
       Cost += VL;
       break;
     }
+    case RISCV::VMV_X_S:
+    case RISCV::VFMV_F_S:
+      Cost += 1;
+      break;
     case RISCV::VMV_S_X:
-      // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1
+    case RISCV::VFMV_S_F:
+      Cost += 1;
+      break;
     default:
       Cost += LMULCost;
     }
@@ -444,9 +453,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     // vmv.s.x      v0, a0
     // vmerge.vvm   v8, v9, v8, v0
     return LT.first *
-           (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li
-            getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
-                                    LT.second, CostKind));
+           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
+                                        LT.second, CostKind));
   }
   case TTI::SK_Broadcast: {
     bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
@@ -459,9 +467,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         //   vmv.v.x v8, a0
         //   vmsne.vi v0, v8, 0
         return LT.first *
-               (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi
-                getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
-                                        LT.second, CostKind));
+               (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
+                                            LT.second, CostKind));
       }
       // Example sequence:
       //   vsetivli  zero, 2, e8, mf8, ta, mu (ignored)
@@ -473,12 +480,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       //   vmsne.vi  v0, v8, 0
 
       return LT.first *
-             (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi
-              TLI->getLMULCost(
-                  LT.second) + // FIXME: vmv.x.s is the same as extractelement
-              getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
-                                       RISCV::VMV_V_X, RISCV::VMSNE_VI},
-                                      LT.second, CostKind));
+             (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
+                                           RISCV::VMV_X_S, RISCV::VMV_V_X,
+                                           RISCV::VMSNE_VI},
+                                          LT.second, CostKind));
     }
 
     if (HasScalar) {
@@ -523,9 +528,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     if (LT.second.isFixedLengthVector())
       // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
       LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
-    // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
-    InstructionCost GatherCost =
-        2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
+    InstructionCost GatherCost = getRISCVInstructionCost(
+        {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}, LT.second,
+        CostKind);
     // Mask operation additionally required extend and truncate
     InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
     return LT.first * (LenCost + GatherCost + ExtendCost);
@@ -1358,19 +1363,53 @@ RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
     return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
 
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
-  if (Ty->getElementType()->isIntegerTy(1))
-    // vcpop sequences, see vreduction-mask.ll.  umax, smin actually only
-    // cost 2, but we don't have enough info here so we slightly over cost.
-    return (LT.first - 1) + 3;
+  std::array<unsigned, 3> Opcodes;
+  if (Ty->getElementType()->isIntegerTy(1)) {
+    // vcpop sequences, see vreduction-mask.ll.
+    if ((IID == Intrinsic::umax) || (IID == Intrinsic::smin))
+      Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M, RISCV::SLT};
+    else
+      Opcodes = {RISCV::VCPOP_M, RISCV::SLT};
+    return (LT.first - 1) +
+           getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+  }
 
   // IR Reduction is composed by two vmv and one rvv reduction instruction.
-  InstructionCost BaseCost = 2;
-
-  if (CostKind == TTI::TCK_CodeSize)
-    return (LT.first - 1) + BaseCost;
-
-  unsigned VL = getEstimatedVLFor(Ty);
-  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
+  unsigned SplitOp;
+  switch (IID) {
+  default:
+    llvm_unreachable("Unsupported intrinsic");
+  case Intrinsic::smax:
+    SplitOp = RISCV::VMAX_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
+    break;
+  case Intrinsic::smin:
+    SplitOp = RISCV::VMIN_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
+    break;
+  case Intrinsic::umax:
+    SplitOp = RISCV::VMAXU_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
+    break;
+  case Intrinsic::umin:
+    SplitOp = RISCV::VMINU_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
+    break;
+  case Intrinsic::maxnum:
+    SplitOp = RISCV::VFMAX_VV;
+    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
+    break;
+  case Intrinsic::minnum:
+    SplitOp = RISCV::VFMIN_VV;
+    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
+    break;
+  }
+  // Add a cost for data larger than LMUL8
+  InstructionCost SplitCost =
+      (LT.first > 1) ? (LT.first - 1) *
+                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
+                     : 0;
+  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
 }
 
 InstructionCost
@@ -1392,20 +1431,50 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
 
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
-  if (Ty->getElementType()->isIntegerTy(1))
+  std::array<unsigned, 3> Opcodes;
+  if (Ty->getElementType()->isIntegerTy(1)) {
     // vcpop sequences, see vreduction-mask.ll
-    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
+    if (ISD == ISD::AND)
+      Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M, RISCV::SLT};
+    else
+      Opcodes = {RISCV::VCPOP_M, RISCV::SLT};
+    return (LT.first - 1) +
+           getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+  }
 
   // IR Reduction is composed by two vmv and one rvv reduction instruction.
-  InstructionCost BaseCost = 2;
-
-  if (CostKind == TTI::TCK_CodeSize)
-    return (LT.first - 1) + BaseCost;
-
-  unsigned VL = getEstimatedVLFor(Ty);
-  if (TTI::requiresOrderedReduction(FMF))
-    return (LT.first - 1) + BaseCost + VL;
-  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
+  unsigned SplitOp;
+  switch (ISD) {
+  case ISD::ADD:
+    SplitOp = RISCV::VADD_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
+    break;
+  case ISD::OR:
+    SplitOp = RISCV::VOR_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
+    break;
+  case ISD::XOR:
+    SplitOp = RISCV::VXOR_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
+    break;
+  case ISD::AND:
+    SplitOp = RISCV::VAND_VV;
+    Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
+    break;
+  case ISD::FADD:
+    SplitOp = RISCV::VFADD_VV;
+    if (TTI::requiresOrderedReduction(FMF))
+      Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDOSUM_VS, RISCV::VFMV_F_S};
+    else
+      Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
+    break;
+  }
+  // Add a cost for data larger than LMUL8
+  InstructionCost SplitCost =
+      (LT.first > 1) ? (LT.first - 1) *
+                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
+                     : 0;
+  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
 }
 
 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
index 6fe098628ea078..ed9d71cad0be61 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-add.ll
@@ -6,25 +6,25 @@
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIZE-LABEL: 'reduce_i1'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.add.v2i1(<2 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.add.v64i1(<64 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i1 @llvm.vector.reduce.add.v128i1(<128 x i1> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1   = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> undef)
@@ -51,14 +51,14 @@ define i32 @reduce_i8(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIZE-LABEL: 'reduce_i8'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1   = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
@@ -85,14 +85,14 @@ define i32 @reduce_i16(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIZE-LABEL: 'reduce_i16'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1   = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
@@ -115,18 +115,18 @@ define i32 @reduce_i32(i32 %arg) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIZE-LABEL: 'reduce_i32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for i...
[truncated]

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

wangpc-pp

LGTM with nits.

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

topperc · 2024-01-11T04:59:39Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

+      Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
+      return (LT.first - 1) +
+             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
+             getCmpSelInstrCost(Instruction::Select, ElementTy, ElementTy,


Shouldn't this be Instruction::ICmp?

Fixed. Thanks!

topperc · 2024-01-11T05:09:53Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

+  case ISD::FADD:
+    SplitOp = RISCV::VFADD_VV;
+    if (TTI::requiresOrderedReduction(FMF))
+      Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDOSUM_VS, RISCV::VFMV_F_S};


Ordered reductions don't split the same way. They chain a series of VFREDOSUMs.

Thanks for pointing out this! Fixed.

This is inspired by llvm#77342 (review), and is split off of same with some differences in style. A select is a vmerge.vv with the additional cost of materializing the bitmask vector in a vreg. All masks fit within a single vector register (e8 + m8 is the worst case), and thus our worst case cost should be roughly 3 (2 scalar to produce the address, one vector load op). Given most shuffles are small, and the mask will be instead produced by LUI/ADDI + vmv.s.x or ADDI + vmv.s.x, using 2 as the default seems quite reasonable. At worst, we're not going to be off by much. The prior lowering scaled the cost of the bitmask with LMUL, which I don't understand. At m1 it did use the same base cost of 2.

preames · 2024-01-12T19:15:13Z

I split one piece of this off in #77963.

For this to make rapid progress, I recommend splitting it into individual pieces to the maximum extent possible.

@lukel97

) This is inspired by #77342 (review), and is split off of same with some differences in style. A select is a vmerge.vv with the additional cost of materializing the bitmask vector in a vreg. All masks fit within a single vector register (e8 + m8 is the worst case), and thus our worst case cost should be roughly 3 (2 scalar to produce the address, one vector load op). Given most shuffles are small, and the mask will be instead produced by LUI/ADDI + vmv.s.x or ADDI + vmv.s.x, using 2 as the default seems quite reasonable. At worst, we're not going to be off by much. The prior lowering scaled the cost of the bitmask with LMUL, which I don't understand. At m1 it did use the same base cost of 2. (@lukel97 You wrote the original code here, anything I'm missing here?)

lukel97 · 2024-01-19T00:02:23Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

+    case RISCV::VMV_X_S:
+    case RISCV::VFMV_F_S:
    case RISCV::VMV_S_X:
-      // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1
+    case RISCV::VFMV_S_F:
+      Cost += 1;
+      break;


Can we split this off into a separate PR? I think it would be useful to be able to see if this has any effect on the current cost model tests.

Sure, I split it in #78739. will update this PR after that.

preames · 2024-01-22T15:39:33Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

-    InstructionCost GatherCost =
-        2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
+    InstructionCost GatherCost = getRISCVInstructionCost(
+        {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}, LT.second,


This bit isn't right. In particular, it ignore the code two lines above which talks about different cases for costing the index sequence.

I made a change to query the cost of VRSUB_VX/VRSUB_VI separately. Not sure this is what you expected.

preames · 2024-01-22T15:40:36Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

-    // vcpop sequences, see vreduction-mask.ll.  umax, smin actually only
-    // cost 2, but we don't have enough info here so we slightly over cost.
-    return (LT.first - 1) + 3;
+  if (Ty->getElementType()->isIntegerTy(1)) {


If you separate this as it's own patch, should be easy to review.

It is split off in #79401

preames · 2024-01-22T15:47:57Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

-  if (CostKind == TTI::TCK_CodeSize)
-    return (LT.first - 1) + BaseCost;
-
-  unsigned VL = getEstimatedVLFor(Ty);


The net effect of this change appears to be to increase BaseCost by 1, make the reduction linear in lmul, and remove the log2(VL) term.

This bit (maybe with the other reduction change) needs to be it's own review so we can clearly see the effect. This one (in particular), you should also try running through some reasonable large codebase (llvm test-suite, spec, whatever) and checking for unexpected interactions.

OK! I split this off into #79103.

This patch is split off from #77342 - Correct for CodeSize cost that 1 instruction is not included. 3 is from {VMV.S, ReductionOp, VMV.X} - Add SplitCost Unordered reduction chain a series of VADD/VFADD/... which scales with LMUL. Ordered reductions chain a series of VFREDOSUMs. - Use MVT to estimate VL.

This patch is split off from llvm#77342, and follows llvm#79103 - Correct for CodeSize cost that 1 instruction is not included. 3 is from {VMV.S, ReductionOp, VMV.X} - Add SplitCost which chains a series of VMAX/VMIN/... which scales with LMUL. - Use MVT to estimate VL.

It is split off from #77342. InstCombine transform min/max reduction with i1 into arithmetic reduction, so this patch reuses the cost logic in arithmetic reduction cost function.

preames · 2024-01-29T15:50:15Z

After #79402 lands, I think all the pieces of this have gone in right? If not, can you rebase so what's left is visible?

This patch is split off from #77342, and follows #79103 - Correct for CodeSize cost that 1 instruction is not included. 3 is from {VMV.S, ReductionOp, VMV.X} - Add SplitCost which chains a series of VMAX/VMIN/... which scales with LMUL. - Use MVT to estimate VL.

- Make `andi` cost 1 in SK_Broadcast - Query the cost of VID_V, VRSUB_VX which would scale with LMUL

preames · 2024-01-30T15:12:38Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

@@ -482,11 +481,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      //   vmsne.vi  v0, v8, 0


Unrelated codegen opportunity

vmv.x.s a0, v8
andi a0, a0, 1
vmv.v.x v8, a0

Could be a vrgather.vi

preames · 2024-01-30T15:13:26Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

@@ -482,11 +481,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      //   vmsne.vi  v0, v8, 0

      return LT.first *
-             (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi


If the index is non-zero, then there's a vslide needed in the current sequence which does scale with LMUL.

So, this change is wrong for non-zero index.

For SK_Broadcast, my understanding is that the index should always be the first element of the source vector.
I was based on this

if (Shuffle->isZeroEltSplat()) return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, Shuffle->getShuffleMask(), CostKind, 0, nullptr, Operands);

Could we clarify this?"

It looks like you're correct here. There are other cases which create SK_Broadcast, but from what I can tell, all are consistent with the splat index always being zero.

To chime in here, it's defined in TargetTransformInfo.h as being a broadcast of element zero:

enum ShuffleKind { SK_Broadcast, ///< Broadcast element 0 to all other elements.

preames · 2024-01-30T15:16:39Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

-    // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
-    InstructionCost GatherCost =
-        2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
+      GatherCost = getRISCVInstructionCost(


Please common the tail here, and undo the change which is restructuring the existing LenCost computation since that appears to be non-functional in your diff.

preames · 2024-01-30T15:17:36Z

Given you've now reduced the scope of this, you need to update the review description so that it matches the current patch.

arcbbb · 2024-01-31T02:14:27Z

Given you've now reduced the scope of this, you need to update the review description so that it matches the current patch.

Updated, thanks!

preames · 2024-02-01T21:00:10Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

@@ -531,9 +529,12 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
-    // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
+    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV : VRGATHER_VV};


"ISCV : VRGATHER_VV" - Does this build? Shouldn't there be a second :?

The cost sequence here is now ignoring the LI for the VX case.

Please, don't change LenCost. It's correct. Adjust only the other bits to make your fix.

Maybe I overlooked something, here are the three cost sequences I am considering:

scalable vector
LenCost:3 from { csrr a0, vlenb, srli a0, a0, 3, addi a0, a0, -1 }
GatherCost from {vid.v, vrsub.vx, vrgather.vv}

fixed-length vector, imm > 5bit
LenCost:1 from { li }
GatherCost from {vid.v, vrsub.vx, vrgather.vv}

fixed-length vector, imm <= 5 bit
LenCost:0 from { }
GatherCost from {vid.v, vrsub.vi, vrgather.vv}

Could you help me correct this?

@preames I can't see where LenCost is changed in this branch. I think you might be looking at an earlier version of the diff before this force push? 62be3d6

lukel97 · 2024-02-02T03:47:58Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

+    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
+    if (LT.second.isFixedLengthVector() &&
+        isInt<5>(LT.second.getVectorNumElements() - 1))
+      Opcodes[1] = RISCV::VRSUB_VI;


Just to double check, we don't cost VRSUB_VI any differently from VRSUB_VX do we?

Yes, the cost for both is the same.

lukel97 · 2024-02-02T04:00:23Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

@@ -531,9 +529,12 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
-    // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX}
+    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV : VRGATHER_VV};


@preames I can't see where LenCost is changed in this branch. I think you might be looking at an earlier version of the diff before this force push? 62be3d6

lukel97 · 2024-02-02T04:02:00Z

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

@@ -482,11 +481,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
      //   vmsne.vi  v0, v8, 0

      return LT.first *
-             (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi


To chime in here, it's defined in TargetTransformInfo.h as being a broadcast of element zero:

enum ShuffleKind { SK_Broadcast, ///< Broadcast element 0 to all other elements.

arcbbb · 2024-02-15T16:51:01Z

@preames ping

lukel97

LGTM

arcbbb · 2024-02-26T09:19:50Z

I am going to merge this if no further inputs. Thanks!

arcbbb added backend:RISC-V llvm:analysis labels Jan 8, 2024

arcbbb requested review from preames, lukel97, topperc and wangpc-pp January 8, 2024 17:01

llvmbot added the llvm:transforms label Jan 8, 2024

wangpc-pp reviewed Jan 9, 2024

View reviewed changes

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp Outdated Show resolved Hide resolved

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp Outdated Show resolved Hide resolved

wangpc-pp reviewed Jan 10, 2024

View reviewed changes

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp Outdated Show resolved Hide resolved

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp Outdated Show resolved Hide resolved

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp Outdated Show resolved Hide resolved

wangpc-pp approved these changes Jan 11, 2024

View reviewed changes

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp Outdated Show resolved Hide resolved

topperc reviewed Jan 11, 2024

View reviewed changes

preames mentioned this pull request Jan 12, 2024

[RISCV] Adjust select shuffle cost to reflect mask creation cost #77963

Merged

lukel97 reviewed Jan 19, 2024

View reviewed changes

preames reviewed Jan 22, 2024

View reviewed changes

arcbbb mentioned this pull request Jan 23, 2024

[RISCV][CostModel] Refine Arithmetic reduction costs #79103

Merged

arcbbb mentioned this pull request Jan 25, 2024

[RISCV] Refine cost on Min/Max reduction with i1 type #79401

Merged

arcbbb mentioned this pull request Jan 25, 2024

[RISCV] Refine cost on Min/Max reduction #79402

Merged

[RISCV][CostModel] Improve cost for SK_Broadcast and SK_Reverse

890f7da

- Make `andi` cost 1 in SK_Broadcast - Query the cost of VID_V, VRSUB_VX which would scale with LMUL

arcbbb force-pushed the cost-reduction branch from 8c9ec5d to 890f7da Compare January 30, 2024 09:16

preames requested changes Jan 30, 2024

View reviewed changes

Add check for VRSUB_VI

5ee789e

arcbbb force-pushed the cost-reduction branch from 62be3d6 to 5ee789e Compare January 31, 2024 02:12

preames reviewed Feb 1, 2024

View reviewed changes

Fix typo

1174057

lukel97 reviewed Feb 2, 2024

View reviewed changes

lukel97 approved these changes Feb 19, 2024

View reviewed changes

arcbbb merged commit 6ee9c8a into llvm:main Feb 29, 2024
3 of 4 checks passed

arcbbb deleted the cost-reduction branch February 29, 2024 07:41

		@@ -482,11 +481,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
		// vmsne.vi v0, v8, 0

[RISCV][CostModel] Updates reduction and shuffle cost #77342

[RISCV][CostModel] Updates reduction and shuffle cost #77342

Conversation

arcbbb commented Jan 8, 2024 • edited

llvmbot commented Jan 8, 2024 • edited

wangpc-pp left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

preames commented Jan 12, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

preames commented Jan 29, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

preames commented Jan 30, 2024

arcbbb commented Jan 31, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

arcbbb commented Feb 15, 2024

lukel97 left a comment

Choose a reason for hiding this comment

arcbbb commented Feb 26, 2024

arcbbb commented Jan 8, 2024 •

edited

llvmbot commented Jan 8, 2024 •

edited