[TTI] BasicTTI::getInterleavedMemoryOpCost(): use getScalarizationOve…

…rhead() getScalarizationOverhead() results in a somewhat better cost estimation than counting the insertion/extraction costs directly. Notably, this is still overestimating the costs. Original Patch by: @lebedev.ri (Roman Lebedev) Differential Revision: https://reviews.llvm.org/D110713
llvm · Sep 29, 2021 · 17f1fc1 · 17f1fc1
1 parent 7674bd4
commit 17f1fc1
Show file tree

Hide file tree

Showing 14 changed files with 132 additions and 116 deletions.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1237,6 +1237,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // Then plus the cost of interleave operation.
     assert(Indices.size() <= Factor &&
            "Interleaved memory op has too many members");
+
+    APInt DemandedLoadStoreElts = APInt::getNullValue(NumElts);
+    for (unsigned Index : Indices) {
+      assert(Index < Factor && "Invalid index for interleaved memory op");
+      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
+        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
+    }
+
     if (Opcode == Instruction::Load) {
       // The interleave cost is similar to extract sub vectors' elements
       // from the wide vector, and insert them into sub vectors.
@@ -1246,21 +1254,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
       // <8 x i32> vector and insert them into a <4 x i32> vector.
-      for (unsigned Index : Indices) {
-        assert(Index < Factor && "Invalid index for interleaved memory op");
-
-        // Extract elements from loaded vector for each sub vector.
-        for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-          Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
-                                              Index + Elm * Factor);
-      }
-
-      InstructionCost InsSubCost = 0;
-      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-        InsSubCost +=
-            thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm);
-
+      InstructionCost InsSubCost =
+          getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false);
       Cost += Indices.size() * InsSubCost;
+      Cost +=
+          thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+                                            /*Insert*/ false, /*Extract*/ true);
     } else {
       // The interleave cost is extract elements from sub vectors, and
       // insert them into the wide vector.
@@ -1275,20 +1274,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       // The cost is estimated as extract all elements (of actual members,
       // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
       // i32> vector.
-      InstructionCost ExtSubCost = 0;
-      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-        ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement,
-                                                  SubVT, Elm);
+      InstructionCost ExtSubCost =
+          getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
       Cost += ExtSubCost * Indices.size();
-
-      for (unsigned Index : Indices) {
-        assert(Index < Factor && "Invalid index for interleaved memory op");
-
-        // Insert elements from loaded vector for each sub vector.
-        for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-          Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT,
-                                              Index + Elm * Factor);
-      }
+      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+                                                /*Insert*/ true,
+                                                /*Extract*/ false);
     }
 
     if (!UseMaskForCond)
@@ -1308,13 +1299,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // The cost is estimated as extract all mask elements from the <8xi1> mask
     // vector and insert them factor times into the <24xi1> shuffled mask
     // vector.
-    for (unsigned i = 0; i < NumSubElts; i++)
-      Cost +=
-          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
-
-    for (unsigned i = 0; i < NumElts; i++)
-      Cost +=
-          thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
+    Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
+    Cost +=
+        getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false);
 
     // The Gaps mask is invariant and created outside the loop, therefore the
     // cost of creating it is not accounted for here. However if we have both

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
@@ -22,8 +22,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 41 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
@@ -22,15 +22,15 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 31 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 58 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 31 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 58 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
@@ -22,8 +22,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 41 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 82 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -21,13 +21,13 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 285 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 285 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
@@ -21,7 +21,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 31 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 123 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 342 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
@@ -13,24 +13,24 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 160 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 81 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 226 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 33 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 81 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 226 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX2: LV: Found an estimated cost of 166 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
@@ -21,9 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2
 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 49 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX1: LV: Found an estimated cost of 35 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
 ; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
@@ -20,17 +20,17 @@ target triple = "x86_64-unknown-linux-gnu"
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 53 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 66 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 30 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 53 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2

diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
@@ -20,10 +20,10 @@ target triple = "x86_64-unknown-linux-gnu"
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
 ; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2