[CostModel][X86] Fix overcounting arithmetic cost in illegal types in…

… getArithmeticReductionCost/getMinMaxReductionCost We were overcounting the number of arithmetic operations needed at each level before we reach a legal type. We were using the full vector type for that level, but we are going to split the input vector at that level in half. So the effective arithmetic operation cost at that level is half the width. So for example on 8i32 on an sse target. Were were calculating the cost of an 8i32 op which is likely 2 for basic integer. Then after the loop we count 2 more v4i32 ops. For a total arith cost of 4. But if you look at the assembly there would only be 3 arithmetic ops. There are still more bugs in this code that I'm going to work on next. The non pairwise code shouldn't count extract subvectors in the loop. There are no extracts, the types are split in registers. For pairwise we need to use 2 two src permute shuffles. Differential Revision: https://reviews.llvm.org/D55397 llvm-svn: 348621
llvm · Dec 7, 2018 · d1498ed · d1498ed
1 parent ba3ab78
commit d1498ed
Show file tree

Hide file tree

Showing 22 changed files with 1,053 additions and 1,369 deletions.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1431,7 +1431,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                  NumVecElts, SubTy);
-      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
+      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy);
       Ty = SubTy;
       ++LongVectorCount;
     }
@@ -1476,16 +1476,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
       Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
+      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
+
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                  NumVecElts, SubTy);
       MinMaxCost +=
-          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
-          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) +
+          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
                                           nullptr);
       Ty = SubTy;
-      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
       ++LongVectorCount;
     }
     // The minimal length of the vector is limited by the real length of vector

diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-add-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-and-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-or-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smax-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umax-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umin-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor-widen.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor-widen.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -12,8 +12,8 @@
 
 ; Vector cost is 5, Scalar cost is 7
 ; AVX: Adding cost -2 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
-; Vector cost is 7, Scalar cost is 7
-; SSE: Adding cost 0 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; Vector cost is 6, Scalar cost is 7
+; SSE: Adding cost -1 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
 define i32 @test_add(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test_add(
 ; CHECK-NEXT:  entry: